############################################################
## R script to run some actual regressions. Output ist mostly
## written directly to files which are import()-ed into latex file
##
## Chris path '/Users/creidl/Dropbox/Gov 2001 project/'
## Frank path 'C:/Users/fnagle/Documents/My Dropbox/Gov 2001 project/'
##
## Change Log
##     	2012-03-12 started initial version (Frank + Chris)
##	 	2012-03-14 added clustered SE's, fixed random effects model (Frank)
##		2012-03-15 added robust SE's
##		2012-03-16 Added apsrtable framework to produce nice table with multiple models side-by-side (Chris)
##					 Messed with tables during meeting (Frank + Chris)
##					 added code for summary statistics and clean-up (Chris)
##		2012-03-17 Changed layout / ordering of models and variables. Moved COMPETITON to controls
##      2012-03-21 Updated all tables with new dependent variable DEP2 (Chris)
##                 Added additional table with random effects model (Chris)
##      2012-03-22 Clean-up and commenting to prepare for submission
##		2012-04-16 Stream-lined code; merged in 2009 data; produce final figure for Gary paper (Chris)
############################################################

# Load required libraries
library(lme4)
library(Zelig)
library(lmtest)
library(xtable)
library(apsrtable)		# For combining/printing tables of multiple OLS models
library(sandwich)
library(memisc)			# For combining/printing tables of multiple lmer models 


# Some helper code has been moved to other source. Load them.
source("/Users/creidl/Dropbox/T1D/04-dataAnalysis-ratings/corstarsl.R")
source("/Users/creidl/Dropbox/Gov 2001 project/R_dataAnalysis/HelperFunctions.R")


# Read Data
setwd('/Users/creidl/Dropbox/Gov 2001 project/R_dataCollection/')
m <- rbind( read.csv(file="data/master-data-DAY-2007.csv", head=TRUE, sep=","),
			read.csv(file="data/master-data-DAY-2008.csv", head=TRUE, sep=","),
			read.csv(file="data/master-data-DAY-2009.csv", head=TRUE, sep=",") )


#######################################
### Figure 1: Bi-Modal Distribution
#######################################
setwd('/Users/creidl/Dropbox/Gov 2001 project/R_dataCollection/')
a <- rbind(
	read.csv(file="data/FULL-REVIEWS-DAY-2007.csv", head=TRUE, sep=","),
	read.csv(file="data/FULL-REVIEWS-DAY-2008.csv", head=TRUE, sep=","),
	read.csv(file="data/FULL-REVIEWS-DAY-2009.csv", head=TRUE, sep=",") )
a <- a[a$DAY > 0 & a$DAY < 36,]

#ggplot(allReviews, aes(x=reviewRating)) + geom_histogram() + scale_y_log10() 
#ggsave("plot.pdf", width=4, height=4)

# setwd('/Users/creidl/Dropbox/Gov 2001 project/our-paper/')
# pdf("figures/Rplot-Rating-Distribution-allReviews.pdf")
# hist(a$reviewRating, col="black", xlab="Star-Rating", ylab="Count", main="Histogram of Star-Rating Distribution")
# dev.off()

## Variation including the fitted mixed model
mix <- normalmixEM(a$reviewRating)

setwd('/Users/creidl/Dropbox/Gov 2001 project/our-paper/')
pdf("figures/Rplot-Rating-Distribution-allReviewsALT.pdf", width=5, height=5)
hist(a$reviewRating, xlab="Star-Rating", ylab="Density", main="Histogram of Star-Rating Distribution", probability=TRUE)
curve(dnorm(x, mean=mix$mu[1], sd=mix$sigma[1]), col=cols[1], lwd=lw, add=TRUE)
curve(dnorm(x, mean=mix$mu[2], sd=mix$sigma[2]), col=cols[2], lwd=lw, add=TRUE)
dev.off()


##################################################
## Figure 2: Hump-shaped decay of movie reviews
##################################################
x <- aggregate(m[c("VOL")], m[c("DAY")], sum)
y <- aggregate(m[c("BOX")], m[c("DAY")], sum)
saturdaysX <- x[x$DAY %in% c(2, 9, 16, 23, 30),]
saturdaysY <- y[y$DAY %in% c(2, 9, 16, 23, 30),]

cols=c("#00A600FF", "#FF4000FF")
lw=2
# https://stat.ethz.ch/pipermail/r-help/2003-January/028837.html
# http://rwiki.sciviews.org/doku.php?id=tips:graphics-base:2yaxes
#text(saturdaysY, c("Sat WK1", "Sat WK2", "Sat WK3", "Sat WK4", "Sat WK5"), pos=3)
setwd('/Users/creidl/Dropbox/Gov 2001 project/our-paper/')
pdf("figures/Rplot-hump.pdf", width=5, height=5)
## add extra space to right margin of plot within frame
par(mar=c(4, 4, 3, 5))
plot(x, type="l", main="Volume of Posted Online Reviews and\nBox Office Revenue per Day", axes=FALSE, ylab="Volume of Online Reviews (in Thousands)", xlab="Day (marks highlight Saturdays)", col=cols[1], lwd=lw)
points(saturdaysX, pch=1, col=cols[1], lwd=lw)
#text(saturdaysX, c("Sat WK1", "Sat WK2", "Sat WK3", "Sat WK4", "Sat WK5"), pos=3)
axis(2, pretty(range(x$VOL)))

par(new=T)

plot(y, axes=F, ylab="", xlab="", type="l", col=cols[2], lwd=lw)
axis(4, pretty(range(y$BOX)))
points(saturdaysY, pch=2, col=cols[2], lwd=lw)
axis(1,pretty(range(x$DAY)))
mtext("Box Office Revenue (in Millions)", side=4, line=4) 

legend("topright", c("Volume of online reviews", "Box office revenue"), pch=c(1, 2), col=cols, title="Legend", inset=.05, lwd=lw)
dev.off()



##################################################
## Figure 3: Average deviation of movie i’s mean rating from the mean rating of all ratings.
##################################################
setwd('/Users/creidl/Dropbox/Gov 2001 project/R_dataCollection')
a <- read.csv(file="data/FULL-2007-2008-2009-DIS-AND-AVG.csv", head=TRUE, sep=",")
a$AVG <- a$sum / a$seq
a <- a[!is.na(a$AVG),]		# Drop the first review of each movie which doesn't have a lagged AVG
byDay <- aggregate(a[c("AVG")], a[c("movieID", "DAY")], mean)
c <- c()
for(i in 1:35) {
	c <- c(c, mean(byDay$AVG[byDay$DAY<=i]))
}
setwd('/Users/creidl/Dropbox/Gov 2001 project/our-paper/')
pdf("figures/Rplot-CumulativeMeanRatingByDay.pdf", width=5, height=5)
plot(c, type="l", col=cols[1], lwd=lw, main="Cumulative Mean Rating", xlab="Day", ylab="Cumulative Mean Rating")
dev.off()

